###############################################################################-
# Created By: Pietryka
# Creation Date:  2017-03-15
# purpose: extract the innovative text from state constitutions
# Contact: mpietryka@fsu.edu
###############################################################################-



# 1. LOAD DATA AND PACKAGES   ----------------

## 1A. LOAD PACKAGES  ------------

library(tidyverse)   # DATA CLEANING FUNCTIONS
library(stringr)     # STRING FUNCTIONS



## 1B. LOAD DATA ----------------

# Dyad-level data; the 'dyads_df' object is built in
# 'DataClean\SC-1- Dyadic Data.R'.
# NOTE(review): dyads_df is not referenced again in this script -- confirm the
# load is still needed here.
dyads_df <- read_rds("../Data/Derived/dyads_df.rds")
# Echo the "source" attribute attached to the loaded object.
attr(x = dyads_df, which = "source")

# Corpus data; the 'corpus_df' object is built in
# 'DataClean\SC-2- Extract Corpora.R'. Every later section reads from it.
corpus_df <- read_rds("../Data/Derived/corpus_df.rds")
# Echo the "source" attribute attached to the loaded object.
attr(x = corpus_df, which = "source")




# 2. EXTRACT ALL N-GRAMS ------------------

# Helper: expand one nested n-gram column of `corpus` for a single topic.
#   corpus       table with a `topic` column and nested n-gram columns
#   topic_label  value of `topic` whose rows are kept
#   ngram_col    unquoted name of the nested column to expand
# Returns the unnested rows sorted by document_id, then order_in_corpus.
# (Presumably each nested element is a table carrying document_id,
# order_in_corpus and ngram -- those are the columns used downstream.)
unnest_topic_ngrams <- function(corpus, topic_label, ngram_col) {
  topic_rows <- filter(corpus, topic == !!topic_label)
  nested_only <- select(topic_rows, {{ ngram_col }})
  ngram_rows <- unnest(nested_only, cols = {{ ngram_col }})
  arrange(ngram_rows, document_id, order_in_corpus)
}

# Full text (stop words included): 5-, 3- and 7-word n-grams.
all_fivegrams <- unnest_topic_ngrams(corpus_df, "full_text", fivegram)
all_threegrams <- unnest_topic_ngrams(corpus_df, "full_text", threegram)
all_sevengrams <- unnest_topic_ngrams(corpus_df, "full_text", sevengram)

# Full text with stop words removed: 5-word n-grams only.
all_fivegrams_nostop <- unnest_topic_ngrams(
  corpus_df, "full_text_nostop", fivegram
)


# 3. EXTRACT UNIQUE N-GRAMS ----------------------------------------
## (THOSE USED FOR THE FIRST TIME IN THIS DOCUMENT)

# Helper: reduce an n-gram table to one row per distinct `ngram`.
# distinct() keeps the first row it meets for each n-gram, so the result
# depends on the row order of `ngram_rows`; the inputs below were sorted by
# document_id and order_in_corpus when they were built.
# NOTE(review): this treats the lowest document_id as the first use -- confirm
# that document_id sorts chronologically.
# Returns document_id, ngram and a constant flag is_innovative = 1L.
keep_first_use <- function(ngram_rows) {
  first_rows <- distinct(ngram_rows, ngram, .keep_all = TRUE)
  id_and_text <- select(first_rows, document_id, ngram)
  mutate(id_and_text, is_innovative = 1L)
}

original_fivegrams <- keep_first_use(all_fivegrams)
original_threegrams <- keep_first_use(all_threegrams)
original_sevengrams <- keep_first_use(all_sevengrams)
original_fivegrams_nostop <- keep_first_use(all_fivegrams_nostop)

# 4. UNNESTED DATA, N-GRAM LEVEL DATA (LONG) --------------------

# Helper: build one string of first-use text per (topic, document_id).
#   corpus      table with a `topic` column and nested n-gram columns
#   ngram_col   unquoted name of the nested n-gram column to expand
#   originals   first-use table with document_id and ngram (section 3)
#   ngram_size  step between kept positions (5, 3 or 7 below)
# Returns a table with topic, document_id and newtext (kept n-grams pasted
# together with single spaces, in row order).
#
# NOTE(review): no topic filter is applied before the join, so `originals`
# (taken from "full_text" or "full_text_nostop" only) is matched against the
# n-grams of every topic in `corpus` -- confirm this is intended.
# NOTE(review): none of the four objects built here is written out in
# section 6 -- confirm whether they are still needed.
collapse_new_text <- function(corpus, ngram_col, originals, ngram_size) {
  corpus %>%
    select(topic, {{ ngram_col }}) %>%
    unnest(cols = {{ ngram_col }}) %>%
    # KEEP ONLY FIRST TIME IN HISTORY NGRAM IS USED
    right_join(originals, by = c("document_id", "ngram")) %>%
    # KEEP ONLY POSITIONS 1, 1 + ngram_size, 1 + 2 * ngram_size, ...
    # Same rows as `%in% seq(1, max(order_in_corpus), ngram_size)`, but it
    # needs no max(): an empty table or an NA position no longer makes seq()
    # fail (NA positions are simply dropped by filter()).
    filter(
      order_in_corpus >= 1,
      (order_in_corpus - 1) %% (!!ngram_size) == 0
    ) %>%
    # PASTE UNIQUE TEXT INTO ONE STRING PER TOPIC AND DOCUMENT
    group_by(topic, document_id) %>%
    nest() %>%
    mutate(
      newtext = map_chr(data, ~ paste(.x[["ngram"]], collapse = " "))
    ) %>%
    select(topic, document_id, newtext)
}

newtext_df <- collapse_new_text(corpus_df, fivegram, original_fivegrams, 5)

newtext_df3 <- collapse_new_text(corpus_df, threegram, original_threegrams, 3)

newtext_df7 <- collapse_new_text(corpus_df, sevengram, original_sevengrams, 7)

newtext_df_nostop <- collapse_new_text(
  corpus_df, fivegram, original_fivegrams_nostop, 5
)



# 5. EXTRACT BORROWED N-GRAMS (THOSE USED IN A PREVIOUS DOCUMENT) -----------

# Helper: n-gram occurrences of one topic that are not first uses.
#   corpus       table with a `topic` column and nested n-gram columns
#   topic_label  value of `topic` whose rows are kept
#   ngram_col    unquoted name of the nested n-gram column to expand
#   originals    first-use table with document_id and ngram (section 3)
# Returns document_id, ngram and a constant flag is_innovative = 0L. Rows keep
# the order they have in `corpus`; repeated occurrences stay as separate rows.
find_borrowed_ngrams <- function(corpus, topic_label, ngram_col, originals) {
  corpus %>%
    filter(topic == !!topic_label) %>%
    select({{ ngram_col }}) %>%
    unnest(cols = {{ ngram_col }}) %>%
    select(document_id, ngram) %>%
    # Drop every (document_id, ngram) pair recorded as a first use. The keys
    # are spelled out so the join cannot silently pick up extra shared columns
    # and does not print a "Joining, by = ..." message.
    anti_join(originals, by = c("document_id", "ngram")) %>%
    mutate(is_innovative = 0L)
}

borrowed_fivegrams <- find_borrowed_ngrams(
  corpus_df, "full_text", fivegram, original_fivegrams
)

borrowed_threegrams <- find_borrowed_ngrams(
  corpus_df, "full_text", threegram, original_threegrams
)

borrowed_sevengrams <- find_borrowed_ngrams(
  corpus_df, "full_text", sevengram, original_sevengrams
)


borrowed_fivegrams_nostop <- find_borrowed_ngrams(
  corpus_df, "full_text_nostop", fivegram, original_fivegrams_nostop
)



# 6. SAVE ------------------

# Write each n-gram table (all / first-use / borrowed) to Data/Derived.
# The output file is passed positionally: readr 1.4.0 renamed the second
# argument of write_rds() from `path` to `file` and deprecated `path =`, so
# the positional form works on both sides of that change without a warning.
write_rds(all_fivegrams, "../Data/Derived/all_fivegrams.rds")
write_rds(original_fivegrams, "../Data/Derived/original_fivegrams.rds")
write_rds(borrowed_fivegrams, "../Data/Derived/borrowed_fivegrams.rds")

write_rds(all_threegrams, "../Data/Derived/all_threegrams.rds")
write_rds(original_threegrams, "../Data/Derived/original_threegrams.rds")
write_rds(borrowed_threegrams, "../Data/Derived/borrowed_threegrams.rds")

write_rds(all_sevengrams, "../Data/Derived/all_sevengrams.rds")
write_rds(original_sevengrams, "../Data/Derived/original_sevengrams.rds")
write_rds(borrowed_sevengrams, "../Data/Derived/borrowed_sevengrams.rds")


write_rds(all_fivegrams_nostop,
          "../Data/Derived/all_fivegrams_nostop.rds")
write_rds(original_fivegrams_nostop,
          "../Data/Derived/original_fivegrams_nostop.rds")
write_rds(borrowed_fivegrams_nostop,
          "../Data/Derived/borrowed_fivegrams_nostop.rds")


# DISPLAY VERSION NUMBERS FOR R & PACKAGES IN USE
sessionInfo()
